package com.cmge.ad.spider;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.cmge.ad.cache.ehcache.EhCacheCacheManager;
import com.cmge.ad.model.AlbumStore;
import com.cmge.ad.model.CrawlAlbum;
import com.cmge.ad.model.CrawlLog;
import com.cmge.ad.model.PictureStore;
import com.cmge.ad.service.CrawlService;
import com.cmge.ad.util.SuprUtil;
import com.cmge.ad.util.TimeUtils;
import com.cmge.ad.util.context.SpringContextUtil;

/**
 * WebMagic page processor for sites organised as paginated list pages that link
 * to multi-page album detail pages.
 *
 * <p>Processing flow, as implemented by {@link #process(Page)}:
 * <ol>
 *   <li>On the first list page seen, enqueue every list page from {@code start}
 *       to {@code end} using {@code listUrlTemplate}.</li>
 *   <li>On each list page, extract the album detail URLs and enqueue the ones
 *       whose album is not known to {@link CrawlService} yet.</li>
 *   <li>On the first page of an album, create an in-memory {@link CrawlAlbum}
 *       and record the page's picture.</li>
 *   <li>On each following page of the album, record the picture and follow the
 *       "next page" link.</li>
 *   <li>When no further page is found, persist the album and its pictures.</li>
 * </ol>
 *
 * <p>The class holds three kinds of state: configuration (set through the fluent
 * setters), progress logging (pushed into the cache by {@code putCache}) and
 * crawl counters.
 *
 * <p>NOTE(review): the crawl state (flags, counter, album map) is kept in plain,
 * unsynchronised fields. Confirm that the owning {@link Spider} runs this
 * processor on a single thread.
 *
 * <p>NOTE(review): {@code titleXpath} is configurable but never evaluated in this
 * class, so the album title passed to {@link AlbumStore} is whatever
 * {@link CrawlAlbum#getAlbumTitle()} returns for a freshly created album.
 *
 * @author	ljt
 * @time	2014-12-29 11:16:05
 */
public class AlbumListPageProcessor implements PageProcessor {

	/** Cache region that holds the crawl log queue of every task. */
	private static final String LOG_REGION = "CrawlLog";

	/** File-extension marker used to cut the album id out of a first-page URL. */
	private static final String HTML_EXTENSION_PREFIX = ".h";

	private CrawlService crawlService;

	private EhCacheCacheManager cacheManager;

	private Spider spider;

	private Site site = Site.me().setRetryTimes(3)
								 .setUserAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36");

	/** Albums that are currently being assembled, keyed by album unique id. */
	private final HashMap<String,CrawlAlbum> albumsInProgress = new HashMap<String,CrawlAlbum>();

	// Crawl task number; identical to the primary key of the crawl task table
	private String id;

	// Sleep time, forwarded to the Site in setSleepTime
	private int sleepTime;

	// Sample list URL used to validate the regular expression (not read by this class)
	private String listUrl;

	// Regex applied to the detail-page links found on a list page
	private String infoUrlRegular;

	// XPath prefix of the pager links; "/@href" is appended and the last match is the next page
	private String nextUrlPre;

	// Regex that recognises the first page of an album
	private String infoFirstRegular;

	// XPath that directly yields the next-page URL; takes precedence over nextUrlPre
	private String infoListNextXpath;

	// Regex that recognises a list page
	private String listUrlRegular;

	// List page URL template; "$1" is replaced by the page number
	private String listUrlTemplate;

	// Sample detail URL used to validate the regular expression (not read by this class)
	private String infoUrl;

	// First list page number
	private int start;

	// Last list page number (inclusive)
	private int end;

	// XPath of the detail-page URLs on a list page
	private String infoUrlXpath;

	// Prefix prepended to every detail-page URL
	private String infoUrlPre;

	// XPath of the text content (not read by this class)
	private String infoContentXpath;

	// XPath of the picture URL on a detail page
	private String infoPicXpath;

	// XPath of the duplicate-marker field on a detail page (not read by this class)
	private String repeatFlagXpath;

	// Upper limit of already-known albums before the crawl is stopped
	private int repeatMaxTimes;

	// Number of already-known albums seen so far
	private int repeatTime;

	// Separator between the album id and the page number in a detail-page URL
	private String albumIdSign;

	private Integer siteId;

	private Integer siteCategoryId;

	// XPath of the album title (not read by this class)
	private String titleXpath;

	// True until the list pages have been enqueued once
	private boolean flag = true;

	// Global switch; set to false once the repeat limit is reached
	private boolean repeatFlag = true;

	/**
	 * Creates the processor and looks up its collaborators in the Spring context.
	 */
	public AlbumListPageProcessor() {
		super();
		this.crawlService = SpringContextUtil.getApplicationContext().getBean(CrawlService.class);
		this.cacheManager = SpringContextUtil.getApplicationContext().getBean(EhCacheCacheManager.class);
	}

	/**
	 * Dispatches a downloaded page to the list-page, album-first-page or
	 * album-follow-up-page handler. Does nothing once the repeat limit has been hit.
	 *
	 * @param page the downloaded page
	 */
	@Override
	public void process(Page page) {
		if (!repeatFlag) {
			return;
		}

		String url = page.getUrl().toString();
		if (page.getUrl().regex(listUrlRegular).match()) {
			processListPage(page);
		} else if (page.getUrl().regex(infoFirstRegular).match() && !url.contains(albumIdSign)) {
			processAlbumFirstPage(page, url);
		} else {
			processAlbumNextPage(page, url);
		}
	}

	/** Handles a list page: enqueues the list pages once, then the new album detail pages. */
	private void processListPage(Page page) {
		if (flag) {
			enqueueListPages(page);
			flag = false;
		}

		// Find all detail-page URLs on the current list page
		List<String> infoUrlList;
		if (!StringUtils.isEmpty(infoUrlRegular) && !infoUrlRegular.equals("\\w+")) {
			infoUrlList = page.getHtml().xpath(infoUrlXpath).regex(infoUrlRegular).all();
		} else {
			infoUrlList = page.getHtml().xpath(infoUrlXpath).all();
		}
		if (SuprUtil.isEmptyCollection(infoUrlList)) {
			return;
		}

		// With a configured prefix the URLs are enqueued as-is, without a duplicate check
		if (!StringUtils.isEmpty(infoUrlPre)) {
			for (String relativeUrl : infoUrlList) {
				page.addTargetRequest(infoUrlPre + relativeUrl);
			}
			return;
		}

		// Uniqueness check of the album URLs
		List<String> newInfoUrlList = new ArrayList<String>();
		for (String candidateUrl : infoUrlList) {
			String albumUniqueId = extractAlbumUniqueId(candidateUrl, albumIdSign);
			if (!crawlService.isExistAlbumUniqueId(albumUniqueId)) {
				newInfoUrlList.add(candidateUrl);
				continue;
			}

			repeatTime++;
			if (repeatTime >= repeatMaxTimes) {
				log("重复次数大于最大上限<font style='color:red'>["+repeatMaxTimes+"]</font>，退出爬虫");
				// Leave the crawl; nothing queued after this point would be processed anyway
				repeatFlag = false;
				stopCrawl();
				return;
			}
			log("唯一标识<font style='color:red'>["+candidateUrl+"]</font>已存在,当前重复次数："+repeatTime+"次");
			page.setSkip(true);
		}

		if (!SuprUtil.isEmptyCollection(newInfoUrlList)) {
			page.addTargetRequests(newInfoUrlList);
			log("加载<font style='color:red'>["+page.getUrl().toString()+"]</font>详情页成功，数据大小："+newInfoUrlList.size());
		}
	}

	/** Enqueues every list page from {@code start} to {@code end} (inclusive). */
	private void enqueueListPages(Page page) {
		log("+++++++++++<font style='color:red'>开始加载列表页面</font>+++++++++++");
		List<String> listPageUrls = new ArrayList<String>();
		for (int pageNo = start; pageNo <= end; pageNo++) {
			String url = listUrlTemplate.replace("$1", String.valueOf(pageNo));
			log("加载[<font style='color:red'>"+url+"</font>]成功");
			listPageUrls.add(url);
		}
		page.addTargetRequests(listPageUrls);
		log("+++++++++++<font style='color:red'>加载列表页面结束</font>+++++++++++");
	}

	/** Handles the first page of an album: creates the album and follows the next page, if any. */
	private void processAlbumFirstPage(Page page, String url) {
		log("加载相册首页地址<font style='color:red'>["+url+"]</font>成功");

		// A first-page URL carries no albumIdSign, so the id is the URL without its extension
		String albumUniqueId = extractAlbumUniqueId(url, albumIdSign);
		log("创建相册对象，唯一标识为:<font style='color:red'>["+albumUniqueId+"]</font>");

		CrawlAlbum crawlAlbum = new CrawlAlbum();
		HashSet<String> picList = new HashSet<String>();
		HashSet<String> picInfoList = new HashSet<String>();

		// Picture of the current page
		String currentUrl = page.getHtml().xpath(infoPicXpath).get();
		if (!StringUtils.isEmpty(currentUrl)) {
			log("添加图片<font style='color:red'>["+currentUrl+"]</font>到相册<font style='color:red'>["+albumUniqueId+"]</font>中");
			picList.add(currentUrl);
			picInfoList.add(url);
		}

		crawlAlbum.setPicList(picList);
		crawlAlbum.setPicInfoList(picInfoList);
		crawlAlbum.setUniqueId(albumUniqueId);
		albumsInProgress.put(albumUniqueId, crawlAlbum);

		String nextUrl = findNextUrl(page);
		if (!StringUtils.isEmpty(nextUrl)) {
			log("获取下一页地址：<font style='color:red'>["+nextUrl+"]</font>");
			page.addTargetRequest(nextUrl);
			return;
		}

		// Single-page album: no later page will ever persist it, so do it now
		log("没有下一页地址，保存相册到数据库...");
		saveAlbum(albumUniqueId, crawlAlbum);
	}

	/** Handles a follow-up page of an album: records its picture, then follows or persists. */
	private void processAlbumNextPage(Page page, String url) {
		// Picture of the current page
		String currentUrl = page.getHtml().xpath(infoPicXpath).get();
		// The album key is the part of the URL in front of the separator
		String albumUniqueId = substringBefore(url, albumIdSign);

		CrawlAlbum crawlAlbum = albumsInProgress.get(albumUniqueId);
		if (crawlAlbum == null) {
			// Without an album there is nothing to add to, follow or save
			log("未找到相册对象，唯一标识为:<font style='color:red'>["+albumUniqueId+"]</font>，跳过该页面");
			return;
		}
		log("获取到相册对象，唯一标识为:<font style='color:red'>["+albumUniqueId+"]</font>");

		// Remember the visited page on the album itself so that link cycles are detected
		HashSet<String> picInfoList = crawlAlbum.getPicInfoList();
		picInfoList.add(url);
		if (!StringUtils.isEmpty(currentUrl)) {
			crawlAlbum.getPicList().add(currentUrl);
			log("向相册中添加图片，图片地址为:<font style='color:red'>["+currentUrl+"]</font>");
		}

		String nextUrl = findNextUrl(page);
		if (!StringUtils.isEmpty(nextUrl) && !picInfoList.contains(nextUrl)) {
			log("获取下一页地址：<font style='color:red'>["+nextUrl+"]</font>");
			picInfoList.add(nextUrl);
			page.addTargetRequest(nextUrl);
			return;
		}

		// Last page of the album: store it in the database
		log("没有下一页地址，保存相册到数据库...");
		saveAlbum(albumUniqueId, crawlAlbum);
	}

	/**
	 * Persists the album and one picture record per collected picture URL, then
	 * drops the album from the in-memory map whether or not saving succeeded.
	 */
	private void saveAlbum(String albumUniqueId, CrawlAlbum crawlAlbum) {
		try {
			AlbumStore albumStore = new AlbumStore();
			albumStore.setTitle(crawlAlbum.getAlbumTitle());
			albumStore.setSiteId(siteId);
			albumStore.setSiteCategoryId(siteCategoryId);
			albumStore.setUniqueId(crawlAlbum.getUniqueId());
			crawlService.addAlbumStore(albumStore);

			for (String picUrl : crawlAlbum.getPicList()) {
				// A fresh instance per picture so no state leaks from one insert to the next
				PictureStore pictureStore = new PictureStore();
				pictureStore.setAlbumId(albumStore.getId());
				pictureStore.setMinImageUrl(picUrl);
				pictureStore.setMaxImageUrl(picUrl);
				crawlService.addPictureStore(pictureStore);
			}
		} catch (Exception e) {
			log("保存相册到数据库异常，异常信息："+e.getMessage());
		} finally {
			albumsInProgress.remove(albumUniqueId);
		}
	}

	/**
	 * Looks up the URL of the next album page.
	 *
	 * <p>{@code infoListNextXpath} is used when configured; otherwise the last
	 * {@code @href} below {@code nextUrlPre} is taken. The lookup is done once:
	 * the page content does not change between attempts, so repeating the same
	 * query cannot yield a different answer.
	 *
	 * @param page the current album page
	 * @return the next-page URL, or {@code null} when none is found or the
	 *         candidate does not contain {@code albumIdSign} after its first character
	 */
	private String findNextUrl(Page page) {
		String nextUrl = null;
		if (!StringUtils.isEmpty(infoListNextXpath)) {
			nextUrl = page.getHtml().xpath(infoListNextXpath).get();
		} else {
			List<String> hrefs = page.getHtml().xpath(nextUrlPre+"/@href").all();
			if (hrefs != null && !hrefs.isEmpty()) {
				nextUrl = hrefs.get(hrefs.size() - 1);
			}
		}

		if (!StringUtils.isEmpty(nextUrl) && nextUrl.indexOf(albumIdSign) > 0) {
			return nextUrl;
		}
		return null;
	}

	/**
	 * Derives the album unique id from a detail-page URL: the part in front of
	 * {@code sign} when the URL contains it, otherwise the URL without its
	 * trailing {@code .h...} extension. Both markers are matched literally.
	 */
	static String extractAlbumUniqueId(String url, String sign) {
		if (url.contains(sign)) {
			return substringBefore(url, sign);
		}
		int extensionPos = url.lastIndexOf(HTML_EXTENSION_PREFIX);
		return extensionPos >= 0 ? url.substring(0, extensionPos) : url;
	}

	/** Returns the part of {@code text} in front of the first literal {@code sign}, or all of it. */
	static String substringBefore(String text, String sign) {
		int signPos = text.indexOf(sign);
		return signPos >= 0 ? text.substring(0, signPos) : text;
	}

	/**
	 * Stops the crawl task. Does nothing when no spider has been attached yet.
	 */
	public void stopCrawl(){
		if (spider != null) {
			spider.stop();
		}
	}

	/** Appends a log line to this task's queue in the log cache region. */
	private void log(String message) {
		putCache(LOG_REGION, id, message);
	}

	/**
	 * Appends a time-stamped log entry to the queue cached under the given
	 * region and key, creating the queue on first use, and writes the queue back.
	 *
	 * @param region cache region
	 * @param key cache key
	 * @param log log text
	 */
	private void putCache(String region, String key, String log) {
		Object cached = cacheManager.get(region, key);
		// Only this method writes to the region/key pairs it reads, always with this queue type
		@SuppressWarnings("unchecked")
		ConcurrentLinkedQueue<CrawlLog> queue = cached == null
				? new ConcurrentLinkedQueue<CrawlLog>()
				: (ConcurrentLinkedQueue<CrawlLog>) cached;

		CrawlLog crawlLog = new CrawlLog();
		crawlLog.setLog(log);
		crawlLog.setTime(TimeUtils.getTime());
		// Entries are appended, so the newest log is at the tail of the queue
		queue.add(crawlLog);
		cacheManager.put(region, key, queue);
	}

	// ------------------------------------------------------------------
	// Configuration accessors. Setters return this processor so that calls
	// can be chained, except setSpider which returns nothing.
	// ------------------------------------------------------------------

	public String getListUrl() {
		return listUrl;
	}

	public AlbumListPageProcessor setListUrl(String listUrl) {
		this.listUrl = listUrl;
		return this;
	}

	public String getInfoUrl() {
		return infoUrl;
	}

	public AlbumListPageProcessor setInfoUrl(String infoUrl) {
		this.infoUrl = infoUrl;
		return this;
	}

	@Override
	public Site getSite() {
		return site;
	}

	public String getListUrlTemplate() {
		return listUrlTemplate;
	}

	/** Sets the list page URL template; {@code $1} is replaced by the page number. */
	public AlbumListPageProcessor setListUrlTemplate(String listUrlTemplate) {
		this.listUrlTemplate = listUrlTemplate;
		return this;
	}

	public int getStart() {
		return start;
	}

	/** Sets the first list page number. */
	public AlbumListPageProcessor setStart(int start) {
		this.start = start;
		return this;
	}

	public int getEnd() {
		return end;
	}

	/** Sets the last list page number (inclusive). */
	public AlbumListPageProcessor setEnd(int end) {
		this.end = end;
		return this;
	}

	public String getInfoUrlXpath() {
		return infoUrlXpath;
	}

	/** Sets the XPath that selects the detail-page URLs on a list page. */
	public AlbumListPageProcessor setInfoUrlXpath(String infoUrlXpath) {
		this.infoUrlXpath = infoUrlXpath;
		return this;
	}

	public String getInfoUrlPre() {
		return infoUrlPre;
	}

	/** Sets the prefix that is prepended to every detail-page URL. */
	public AlbumListPageProcessor setInfoUrlPre(String infoUrlPre) {
		this.infoUrlPre = infoUrlPre;
		return this;
	}

	public String getInfoContentXpath() {
		return infoContentXpath;
	}

	public AlbumListPageProcessor setInfoContentXpath(String infoContentXpath) {
		this.infoContentXpath = infoContentXpath;
		return this;
	}

	public String getInfoPicXpath() {
		return infoPicXpath;
	}

	/** Sets the XPath that selects the picture URL on a detail page. */
	public AlbumListPageProcessor setInfoPicXpath(String infoPicXpath) {
		this.infoPicXpath = infoPicXpath;
		return this;
	}

	public String getRepeatFlagXpath() {
		return repeatFlagXpath;
	}

	public AlbumListPageProcessor setRepeatFlagXpath(String repeatFlagXpath) {
		this.repeatFlagXpath = repeatFlagXpath;
		return this;
	}

	public int getRepeatMaxTimes() {
		return repeatMaxTimes;
	}

	/** Sets how many already-known albums may be seen before the crawl is stopped. */
	public AlbumListPageProcessor setRepeatMaxTimes(int repeatMaxTimes) {
		this.repeatMaxTimes = repeatMaxTimes;
		return this;
	}

	public AlbumListPageProcessor setSite(Site site) {
		this.site = site;
		return this;
	}

	public Integer getSiteId() {
		return siteId;
	}

	public AlbumListPageProcessor setSiteId(Integer siteId) {
		this.siteId = siteId;
		return this;
	}

	public Integer getSiteCategoryId() {
		return siteCategoryId;
	}

	public AlbumListPageProcessor setSiteCategoryId(Integer siteCategoryId) {
		this.siteCategoryId = siteCategoryId;
		return this;
	}

	public int getSleepTime() {
		return sleepTime;
	}

	/** Sets the sleep time and forwards it to the current {@link Site}. */
	public AlbumListPageProcessor setSleepTime(int sleepTime) {
		this.sleepTime = sleepTime;
		site.setSleepTime(sleepTime);
		return this;
	}

	public int getRepeatTime() {
		return repeatTime;
	}

	public AlbumListPageProcessor setRepeatTime(int repeatTime) {
		this.repeatTime = repeatTime;
		return this;
	}

	public Spider getSpider() {
		return spider;
	}

	/** Attaches the spider that runs this processor, so the processor can stop it. */
	public void setSpider(Spider spider) {
		this.spider = spider;
	}

	public String getId() {
		return id;
	}

	/** Sets the crawl task id, which is also the cache key of the task's log queue. */
	public AlbumListPageProcessor setId(String id) {
		this.id = id;
		return this;
	}

	public String getInfoUrlRegular() {
		return infoUrlRegular;
	}

	public AlbumListPageProcessor setInfoUrlRegular(String infoUrlRegular) {
		this.infoUrlRegular = infoUrlRegular;
		return this;
	}

	public String getListUrlRegular() {
		return listUrlRegular;
	}

	public AlbumListPageProcessor setListUrlRegular(String listUrlRegular) {
		this.listUrlRegular = listUrlRegular;
		return this;
	}

	public String getInfoFirstRegular() {
		return infoFirstRegular;
	}

	public AlbumListPageProcessor setInfoFirstRegular(String infoFirstRegular) {
		this.infoFirstRegular = infoFirstRegular;
		return this;
	}

	public String getAlbumIdSign() {
		return albumIdSign;
	}

	/** Sets the literal separator between album id and page number in detail-page URLs. */
	public AlbumListPageProcessor setAlbumIdSign(String albumIdSign) {
		this.albumIdSign = albumIdSign;
		return this;
	}

	public String getInfoListNextXpath() {
		return infoListNextXpath;
	}

	public AlbumListPageProcessor setInfoListNextXpath(String infoListNextXpath) {
		this.infoListNextXpath = infoListNextXpath;
		return this;
	}

	public String getTitleXpath() {
		return titleXpath;
	}

	public AlbumListPageProcessor setTitleXpath(String titleXpath) {
		this.titleXpath = titleXpath;
		return this;
	}

	public String getNextUrlPre() {
		return nextUrlPre;
	}

	public AlbumListPageProcessor setNextUrlPre(String nextUrlPre) {
		this.nextUrlPre = nextUrlPre;
		return this;
	}
}
